"""Scrape thepaper.cn article pages by id and save title/url/publish-time to CSV."""
from bs4 import BeautifulSoup
import re
import time
import urllib.request

import pandas

# Browser-like User-Agent so the site serves the normal page.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

BASE_URL = 'https://www.thepaper.cn/newsDetail_forward_'

# Load the news ids (one per line). Adjust the path to your setup.
with open('D://pengpai/id.txt', 'r', encoding='utf-8') as newsid:
    idlist = newsid.read().split('\n')

# Publish-time pattern, compiled once instead of per iteration.
TIME_PAT = re.compile(r'<p>.*?(\d+-\d+-\d+ \d+:\d+).*?</p>', re.DOTALL)

title = []
up_time = []
url = []
try:
    for i in idlist:
        # Fetch the article page with a single request; the response gives
        # both the body and the final (post-redirect) URL.
        request = urllib.request.Request(url=BASE_URL + i, headers=HEADERS)
        with urllib.request.urlopen(request) as resp:
            data = resp.read().decode('utf-8', 'ignore')
            htmlurl = resp.geturl()

        # A redirect away from the expected URL means the id is invalid/removed.
        if htmlurl != BASE_URL + i:
            print(i, htmlurl)
            continue

        url.append(htmlurl)

        soup = BeautifulSoup(data, 'lxml')
        title.append(soup.select('title')[0].get_text())
        print(len(title))

        # First timestamp found in the page, or '' when none matches.
        # NOTE: variable renamed from `time` so the time module is not shadowed.
        matches = TIME_PAT.findall(data)
        news_time = matches[0] if matches else ''
        print(news_time)

        up_time.append(news_time)
        print(len(up_time))

        time.sleep(2)  # be polite: throttle between requests
except Exception as err:
    # Log the failure; whatever was collected so far is still saved below.
    print(err)

# Save outside the try so a mid-run error does not discard collected rows.
news = {'title': title, 'url': url, 'time': up_time}
all_data = pandas.DataFrame(news)
all_data.to_csv('D://pengpai/news.csv', encoding='utf-8-sig')  # adjust the path to your setup
print(all_data)
